library(data.table)
library(tidyverse)
library(matrixStats)
library(lubridate)
library(ggraph)
library(ggrepel)
library(ape)
library(ggtree)
library(patchwork)
library(furrr)
library(caper)
library(lme4)
Load allele count data for each replicate. We only consider those that have generated high quality consensus genomes from both replicates.
run1_files <- Sys.glob("./data/allele_counts/run_1/*.tsv")
run2_files <- Sys.glob("./data/allele_counts/run_2/*.tsv")
replicate_meta <- fread("./Processed_data/keep_replicates_meta.csv", data.table = FALSE,
sep = ",", fill = TRUE) %>% as_tibble()
run1_files <- run1_files[gsub("\\_ac.tsv", "", gsub(".*/", "", run1_files)) %in%
replicate_meta$central_sample_id]
run2_files <- run2_files[gsub("\\_ac.tsv", "", gsub(".*/", "", run2_files)) %in%
replicate_meta$central_sample_id]
We only consider intra host single nucleotide variants (iSNVs) that are present with a MAF of at least 0.001 and at least 10 supporting reads in both replicates.
run1_list <- map(run1_files, function(x) {
df <- fread(x, data.table = FALSE) %>% as_tibble
df$sample <- rep(gsub("\\.tsv", "", gsub(".*/", "", x)), nrow(df))
return(df)
})
names(run1_list) <- gsub("_ac\\.tsv", "", gsub(".*/", "", run1_files))
run2_list <- map(run2_files, function(x) {
df <- fread(x, data.table = FALSE) %>% as_tibble
df$sample <- rep(gsub("\\.tsv", "", gsub(".*/", "", x)), nrow(df))
return(df)
})
names(run2_list) <- gsub("_ac\\.tsv", "", gsub(".*/", "", run2_files))
stopifnot(all(names(run1_list) %in% names(run2_list)))
all_sample_names <- names(run1_list)
run_consensus_df <- imap_dfr(all_sample_names, function(s, i) {
mafA <- run1_list[[s]]
mafB <- run2_list[[s]]
# check there are replicates
if (is.null(mafA) || nrow(mafA) < 29000) {
return(mafB)
}
if (is.null(mafB) || nrow(mafB) < 29000) {
return(mafA)
}
stopifnot(all(mafA$POS == mafB$POS))
countA <- sum(mafA$Good_depth)
countB <- sum(mafB$Good_depth)
inA <- mafA[, c("Count_A", "Count_C", "Count_G", "Count_T")] >= pmax(10, (0.001 *
mafA$Good_depth))
inB <- mafB[, c("Count_A", "Count_C", "Count_G", "Count_T")] >= pmax(10, (0.001 *
mafB$Good_depth))
if (countA > countB) {
mafA[, c("Count_A", "Count_C", "Count_G", "Count_T")][!(inA & inB)] <- 0
return(mafA)
} else {
mafB[, c("Count_A", "Count_C", "Count_G", "Count_T")][!(inA & inB)] <- 0
return(mafB)
}
})
run_consensus_df$sample <- gsub("_ac", "", run_consensus_df$sample)
run_consensus_df$Good_depth <- rowSums(run_consensus_df[, c("Count_A", "Count_C",
"Count_G", "Count_T")])
# fwrite(run_consensus_df, file =
# './Processed_data/consensus_allele_counts_mixture.csv')
run_consensus_df <- fread("./Processed_data/consensus_allele_counts_mixture.csv",
data.table = FALSE)
Calculate sample counts
stopifnot(all(replicate_meta$central_sample_id %in% names(run1_list)))
temp <- replicate_meta %>% filter(central_sample_id %in% names(run1_list)) %>% group_by(biosample_source_id) %>%
summarise(count = n()) %>% filter(biosample_source_id != "None")
sample_counts <- tibble(nsamples = length(run1_files), nhosts = length(run1_files) -
sum(temp$count - 1), nhosts_with_multi = sum(temp$count > 1))
knitr::kable(sample_counts)
| nsamples | nhosts | nhosts_with_multi |
|---|---|---|
| 1032 | 981 | 43 |
We now filter iSNVs that were not found using shearwater. We also exclude samples CAMB-728D4 and CAMB-79345 as they had an unusually high number of minority variants (these have already been filtered from the shearwater results).
shearwater_calls <- fread("./data/Shearwater_calls_20200714_annot.txt") %>% as_tibble() %>%
filter(!mut %in% c("-", "INS")) %>% filter(str_length(ref) == str_length(mut))
shearwater_calls <- map_dfr(1:nrow(shearwater_calls), function(i) {
temp <- map2_dfr(unlist(str_split(shearwater_calls$ref[[i]], "")), unlist(str_split(shearwater_calls$mut[[i]],
"")), ~{
temp <- shearwater_calls[i, ]
temp$ref <- .x
temp$mut <- .y
return(temp)
})
temp$pos <- temp$pos[[1]] + 0:(nrow(temp) - 1)
return(temp)
})
shearwater_complex_calls <- fread("./data/Shearwater_calls_20200714_annot.txt", data.table = FALSE) %>%
as_tibble()
length(unique(shearwater_complex_calls$pos[shearwater_complex_calls$vaf >= 0.95]))
## [1] 948
length(unique(shearwater_complex_calls$pos[shearwater_complex_calls$vaf < 0.95]))
## [1] 5625
run_consensus_df <- run_consensus_df %>% filter(sample %in% unique(shearwater_calls$sampleID))
colnames(run_consensus_df) <- c("CHR", "POS", "A", "C", "G", "T", "Good_depth", "sample")
# filter shearwater calls to include just those with good replicates
shearwater_calls <- shearwater_calls %>% filter(sampleID %in% unique(run_consensus_df$sample))
sample_allele_counts <- split(run_consensus_df, run_consensus_df$sample)
run_consensus_df <- map_dfr(unique(shearwater_calls$sampleID), function(sample) {
# consensus of lofreq calls
sw_calls <- shearwater_calls %>% filter(sampleID == sample)
sample_count <- sample_allele_counts[[sample]]
consA <- (sample_count[, c("A", "C", "G", "T")] > 0) & (sample_count[, c("A",
"C", "G", "T")] == rowMaxs(as.matrix(sample_count[, c("A", "C", "G", "T")])))
rownames(consA) <- sample_count$POS
sw_calls <- sw_calls %>% filter(pos %in% sample_count$POS)
consA[cbind(sw_calls$pos, sw_calls$mut)] <- TRUE
consA[cbind(sw_calls$pos, sw_calls$ref)] <- TRUE
sample_count[, c("A", "C", "G", "T")][!consA] <- 0
return(sample_count)
})
Only consider those calls with a coverage of at least 1000. Split into variable and consensus sites.
MINMAF <- 0.01
MINDEPTH <- 1000
# Store all counts
all_run_consensus_df <- run_consensus_df
# Filter to those variable sites
run_consensus_df <- run_consensus_df %>% filter(Good_depth >= MINDEPTH)
keep <- rowSums((run_consensus_df[, c("A", "C", "G", "T")]/run_consensus_df$Good_depth) >=
MINMAF) > 1
run_consensus_df <- as_tibble(run_consensus_df[keep, ])
We ensure only the initial sample for each patient is considered.
meta_multi <- replicate_meta %>% filter(biosample_source_id != "None") %>% group_by(biosample_source_id) %>%
summarise(samples = list(central_sample_id[order(collection_date)]), dates = list(collection_date[order(collection_date)]),
n_samples = length(unique(central_sample_id)), n_dates = length(unique(collection_date))) %>%
filter(n_samples > 1)
repeated_samples <- unlist(map(meta_multi$samples, ~.x[-1]))
Potential mixed infections were identified using a linear model by testing whether the allele frequencies in a sample could be better explained by the inclusion of an additional consensus genome from the COG-UK dataset of the 29th May 2020. Additional samples mixtures were considered if the addition of a COG-UK consensus genome could explain at least 2 iSNVs and have at most 1 variant that was not found in the alleles of the sample. This identified 54 putative mixtures which were then screened manually to obtain 36 potentially mixed samples.
fwrite(run_consensus_df, "./Processed_data/run_consensus_mixture.csv", sep = ",",
quote = FALSE, row.names = FALSE)
run_consensus_df <- fread("./Processed_data/run_consensus_mixture.csv", data.table = FALSE) %>%
as_tibble()
all_seqs <- ape::read.dna("./data/consensus/MA_elan.20200529.consensus.filt.fasta",
format = "fasta")
aln_length <- ncol(all_seqs)
all_seqs[!all_seqs %in% as.DNAbin(c("a", "c", "g", "t"))] <- as.DNAbin("n")
names <- unlist(fread(file = "./data/consensus/snp_dist_MA_elan.20200529.consensus.filt.mask.csv",
header = F, nrows = 1, sep = "\t"))
snp_dist <- fread("./data/consensus/snp_dist_MA_elan.20200529.consensus.filt.mask.csv",
data.table = FALSE, skip = 1, header = FALSE) %>% as_tibble()
snp_dist$V1 <- names[snp_dist$V1 + 2]
snp_dist$V2 <- names[snp_dist$V2 + 2]
dups <- snp_dist %>% group_by(V1) %>% summarise(dups = list(V2))
dups <- unlist(dups$dups)
all_seqs_dedup <- rownames(all_seqs)[!rownames(all_seqs) %in% dups]
all_samples <- unique(run_consensus_df$sample)
plan(multiprocess)
mixture_tests <- furrr::future_map_dfr(all_samples, function(sA) {
print(sA)
aA_orig <- all_seqs[rownames(all_seqs) == sA, ]
mA_orig <- matrix(0, nrow = 4, ncol = aln_length)
mA_orig[1, aA_orig == as.DNAbin("a")] <- 1
mA_orig[2, aA_orig == as.DNAbin("c")] <- 1
mA_orig[3, aA_orig == as.DNAbin("g")] <- 1
mA_orig[4, aA_orig == as.DNAbin("t")] <- 1
sfA_orig <- run_consensus_df %>% filter(sample == sA)
fA_orig <- mA_orig
fA_orig[, sfA_orig$POS] <- t(sfA_orig[, c("A", "C", "G", "T")]/rowSums(sfA_orig[,
c("A", "C", "G", "T")]))
mA_orig[, sfA_orig$POS] <- 1 * t(t(fA_orig[, sfA_orig$POS, drop = FALSE]) ==
colMaxs(fA_orig[, sfA_orig$POS, drop = FALSE]))
pairwise_res <- map_dfr(1:length(all_seqs_dedup), ~{
print(.x)
sB <- all_seqs_dedup[[.x]]
aB <- all_seqs[rownames(all_seqs) == sB, ]
mB <- matrix(0, nrow = 4, ncol = aln_length)
mB[1, aB == as.DNAbin("a")] <- 1
mB[2, aB == as.DNAbin("c")] <- 1
mB[3, aB == as.DNAbin("g")] <- 1
mB[4, aB == as.DNAbin("t")] <- 1
keep <- (colSums(mA_orig) > 0) & (colSums(mB) > 0)
mA <- mA_orig[, keep]
mB <- mB[, keep]
fA <- fA_orig[, keep]
model_data <- tibble(freq = c(unlist(fA)), consensusA = c(unlist(mA)), consensusB = c(unlist(mB)))
model_data <- model_data[(rowSums(model_data == 1) != 3) & (rowSums(model_data ==
0) != 3), , drop = FALSE]
# model_data <-
# model_data[model_data$consensusA!=model_data$consensusB,,drop=FALSE]
n_extra_explained <- sum(((model_data$freq > 0) & (model_data$consensusB >
0)) & (model_data$consensusA <= 0))
n_mismatch <- sum((model_data$freq <= 0) & (model_data$consensusB > 0))
if (nrow(model_data) < 1) {
return(tibble(sampleA = sA, sampleB = sB, estimate = NA, std.error = NA,
statistic = NA, p.value = NA, n_extra_explained = n_extra_explained,
n_mismatch = n_mismatch))
}
model <- broom::tidy(lm(freq ~ -1 + consensusA + consensusB, model_data))[2,
] %>% add_column(sampleB = sB, .before = 1) %>% add_column(sampleA = sA,
.before = 1)
model$term <- NULL
model$n_extra_explained <- n_extra_explained
model$n_mismatch <- n_mismatch
return(model)
}) %>% arrange(p.value) %>% filter(n_mismatch < 3)
return(pairwise_res)
}, .progress = TRUE)
write.csv(mixture_tests, "./Processed_data/mixture_tests_revised.csv", sep = ",",
quote = FALSE, row.names = FALSE)
Load the preprocessed results from the code above.
mixture_tests <- fread("./Processed_data/mixture_tests_revised.csv", data.table = FALSE) %>%
as_tibble()
mixture_tests_mm <- mixture_tests %>% filter(n_mismatch < 2) %>% filter(n_extra_explained >
1) %>% filter(estimate > 0) %>% arrange(p.value)
mixture_tests_mm <- mixture_tests_mm[!duplicated(mixture_tests_mm$sampleA), ]
# remove those that on manual inspection don't appear to be convincingly mixtures
mixture_tests_mm <- mixture_tests_mm[!mixture_tests_mm$sampleA %in% c("CAMB-7A326",
"CAMB-75E93", "CAMB-71F76", "CAMB-71D30", "CAMB-7657F", "CAMB-776F3", "CAMB-76B31",
"CAMB-76BC8", "CAMB-7A71B", "CAMB-7780C", "CAMB-72494", "CAMB-79433", "CAMB-77CB5",
"CAMB-775E7", "CAMB-75FFA", "CAMB-7A890", "CAMB-76B9B", "CAMB-7674C", "CAMB-77FBC"),
]
generate plots of the resulting mixtures
all_meta <- fread("./data/consensus/cog_2020-05-08_metadata.csv", data.table = FALSE) %>%
as_tibble()
all_meta$sample <- map_chr(str_split(all_meta$sequence_name, "/"), ~.x[[2]])
mixture_tests_mm$lineageA <- all_meta$lineage[match(mixture_tests_mm$sampleA, all_meta$sample)]
mixture_tests_mm$lineageB <- all_meta$lineage[match(mixture_tests_mm$sampleB, all_meta$sample)]
aln_length <- ncol(all_seqs)
pair_plots_data <- map_dfr(1:nrow(mixture_tests_mm), function(i) {
print(i)
sA <- mixture_tests_mm$sampleA[[i]]
sB <- mixture_tests_mm$sampleB[[i]]
aA_orig <- all_seqs[rownames(all_seqs) == sA, ]
mA_orig <- matrix(0, nrow = 4, ncol = aln_length)
mA_orig[1, aA_orig == as.DNAbin("a")] <- 1
mA_orig[2, aA_orig == as.DNAbin("c")] <- 1
mA_orig[3, aA_orig == as.DNAbin("g")] <- 1
mA_orig[4, aA_orig == as.DNAbin("t")] <- 1
sfA_orig <- run_consensus_df %>% filter(sample == sA)
fA_orig <- mA_orig
fA_orig[, sfA_orig$POS] <- t(sfA_orig[, c("A", "C", "G", "T")]/rowSums(sfA_orig[,
c("A", "C", "G", "T")]))
mA_orig[, sfA_orig$POS] <- 1 * t(t(fA_orig[, sfA_orig$POS, drop = FALSE]) ==
colMaxs(fA_orig[, sfA_orig$POS, drop = FALSE]))
aB <- all_seqs[rownames(all_seqs) == sB, ]
mB <- matrix(0, nrow = 4, ncol = aln_length)
mB[1, aB == as.DNAbin("a")] <- 1
mB[2, aB == as.DNAbin("c")] <- 1
mB[3, aB == as.DNAbin("g")] <- 1
mB[4, aB == as.DNAbin("t")] <- 1
keep <- (colSums(mA_orig) > 0) & (colSums(mB) > 0)
colnames(mA_orig) <- 1:ncol(mA_orig)
colnames(mB) <- 1:ncol(mB)
colnames(fA_orig) <- 1:ncol(fA_orig)
mA <- mA_orig[, keep]
mB <- mB[, keep]
fA <- fA_orig[, keep]
model_data <- tibble(pos = rep(as.numeric(colnames(fA)), each = 4), base = rep(c("A",
"C", "G", "T"), ncol(fA)), freq = c(unlist(fA)), consensusA = c(unlist(mA)),
consensusB = c(unlist(mB)))
colnames(model_data)[3:5] <- c("frequency", "consensus", "co-infection")
# sprintf('%s, lineage %s', sA, mixture_tests_mm$lineageA[[i]]), sprintf('%s,
# lineage %s', sB, mixture_tests_mm$lineageB[[i]]))#,
# mixture_tests_mm$n_extra_explained[[i]]))
model_data <- model_data[(rowSums(model_data[, 3:5] == 1) != 3) & (rowSums(model_data[,
3:5] == 0) != 3), ]
model_data$lineage <- "other"
model_data$lineage[model_data[, 5] == 1] <- colnames(model_data)[[5]]
model_data$lineage[model_data[, 4] == 1] <- colnames(model_data)[[4]]
model_data$pair <- sprintf("%s (%s) - %s (%s)", sA, mixture_tests_mm$lineageA[[i]],
sB, mixture_tests_mm$lineageB[[i]])
return(model_data)
})
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
pair_plots_data$lineage <- factor(pair_plots_data$lineage, levels = c("consensus",
"co-infection", "other"))
gg <- ggplot(pair_plots_data, aes(x = pos, y = frequency, col = lineage)) + geom_point(size = 2,
alpha = 0.5) + scale_x_continuous(limits = c(1, 29903)) + scale_y_continuous(breaks = c(0,
0.25, 0.5, 0.75, 1)) + scale_color_manual(values = c("#4393c3", "#d6604d", "#bdbdbd")) +
facet_wrap(~pair, ncol = 5) + ylab("freq") + theme_bw(base_size = 12) + xlab("position") +
ylab("frequency")
gg
# pp <- patchwork::wrap_plots(pair_plots,guides = 'collect') +
# patchwork::plot_layout(ncol = 1)
ggsave(gg, filename = "Figures/potential_mixtures_revised.pdf", width = 20, height = 20,
limitsize = FALSE)
gg <- ggplot(pair_plots_data %>% filter(pair %in% c("CAMB-7217F (B.1) - CAMB-79EF9 (B.3)",
"CAMB-7A57B (B.2.1) - CAMB-75ADB (B.1.1)", "CAMB-75BE7 (B.3) - CAMB-76621 (B.3)")),
aes(x = pos, y = frequency, col = lineage)) + geom_point(size = 2, alpha = 0.5) +
scale_x_continuous(limits = c(1, 29903)) + scale_y_continuous(breaks = c(0, 0.25,
0.5, 0.75, 1)) + scale_color_manual(values = c("#4393c3", "#d6604d", "#bdbdbd")) +
facet_wrap(~pair, ncol = 1) + ylab("freq") + theme_bw(base_size = 16) + xlab("position") +
ylab("frequency")
gg
ggsave(gg, filename = "Figures/potential_mixtures_selected_3.png", width = 12, height = 7)
ggsave(gg, filename = "Figures/potential_mixtures_selected_3.pdf", width = 12, height = 7)
To help ascertain whether common intrahost variants are the result of independent mutation events rather than transmission we generate heatmaps for a selection of variants which indicate the genomic background in which they occur. If these variants were the result of transmission we would expect to see the genetic background of fixed variants after aligning to the reference. That is, given an intrahost variant we would expect to see highly similar consensus sequences. The heatmaps below indicate that this is generally not the case and thus independent mutation events better explain many of the shared intrahost variants between samples.
shearwater_calls_no_rep <- shearwater_calls %>% filter(!sampleID %in% repeated_samples)
shearwater_calls_no_rep$pos_change <- sprintf("%s%i%s", shearwater_calls_no_rep$ref,
shearwater_calls_no_rep$pos, shearwater_calls_no_rep$mut)
plot_mixed_vaf <- function(pos) {
temp_samples <- unique(shearwater_calls_no_rep$sampleID[(shearwater_calls_no_rep$pos_change ==
pos) & (shearwater_calls_no_rep$vaf < 0.95) & (shearwater_calls_no_rep$vaf >
0)])
pdf <- shearwater_calls_no_rep %>% filter(sampleID %in% temp_samples)
pdf$pos_change <- factor(pdf$pos_change, levels = c(pos, unique(pdf$pos_change)[unique(pdf$pos_change) !=
pos]))
gg <- ggplot(pdf, aes(x = pos_change, y = sampleID, fill = vaf)) + geom_tile() +
theme_bw(base_size = 14) + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
ylab("sample") + xlab("variant") + # scale_fill_stepsn(colours =
# c('#ca0020','#f4a582','#f7f7f7','#92c5de','#0571b0'), breaks=c(0.25,0.5,0.75))
# +
scale_fill_binned(type = "viridis", breaks = c(0.25, 0.5, 0.75)) + ggtitle(pos)
gg
return(gg)
}
plot_mixed_vaf("A13780T")
ggsave("./Figures/heatmap_A13780T.png", height = 7, width = 10)
plot_mixed_vaf("C683T")
ggsave("./Figures/heatmap_C683T.png", height = 7, width = 10)
plot_mixed_vaf("A1547T")
ggsave("./Figures/heatmap_A1547T.png", height = 7, width = 10)
We call lineages using pangolin v1.1.14 and database date 2020-05-19.
lineage_calls <- fread("/Users/gt4/Documents/covid_deep/data/consensus/combined_consensus_replicates_filt_lineage_report.csv",
data.table = FALSE) %>% as_tibble()
stopifnot(all(run_consensus_df$sample %in% lineage_calls$taxon))
run_consensus_df$lineage <- lineage_calls$lineage[match(run_consensus_df$sample,
lineage_calls$taxon)]
To investigate transmission, samples were only considered if both replicates produced high quality consensus genomes. When multiple samples from the same host were available the earliest sample was used. Pairwise SNP distances were generated between the consensus genomes using pairsnp v0.2.0 (Tonkin-Hill 2018). The distribution of the underlying number of intermediate transmission events between each pair of samples was then inferred using an implementation of the transcluster algorithm (Stimson et al. 2019; Tonkin-Hill 2020). The serial interval and evolutionary rate were set to 5 days and 1e-3 substitutions/site/year (Fauver et al. 2020; Zhang et al. 2020).
replicate_meta <- fread("./Processed_data/keep_replicates_meta.csv", data.table = FALSE,
sep = ",", fill = TRUE) %>% as_tibble()
consensus_seqs <- ape::read.dna("./data/consensus/combined_consensus_replicates_filt.fa",
format = "fasta")
meta <- fread("./data/replicate_meta.tsv", data.table = FALSE) %>% as_tibble()
date_info <- replicate_meta[, c("central_sample_id", "collection_date")] %>% filter(collection_date !=
"None")
consensus_seqs_w_dates <- consensus_seqs[names(consensus_seqs) %in% date_info$central_sample_id]
ape::write.FASTA(consensus_seqs_w_dates, file = "./data/consensus/combined_consensus_replicates_filt_dates.fa")
fwrite(date_info, file = "./data/consensus/combined_consensus_replicates_filt_dates.csv",
quote = FALSE, sep = ",", col.names = TRUE)
Create the necessary multiple sequence alignment.
mafft --auto --thread -1 --keeplength --addfragments combined_consensus_replicates_filt_dates.fa nCoV-2019.reference.fasta > MA_combined_consensus_replicates_filt_dates.fa
temp_msa <- ape::read.dna("./data/consensus/MA_combined_consensus_replicates_filt_dates.fa",
format = "fasta")
temp_msa <- temp_msa[2:nrow(temp_msa), ]
ape::write.FASTA(temp_msa, file = "./data/consensus/MA_combined_consensus_replicates_filt_dates.fa")
Run the pairsnp and transcluster algorithms. These packages are available at https://github.com/gtonkinhill/pairsnp-cpp and https://github.com/gtonkinhill/fasttranscluster respectively.
python ~/fasttranscluster/fasttranscluster-runner.py --msa MA_combined_consensus_replicates_filt_dates.fa --dates combined_consensus_replicates_filt_dates.csv --save_probs --snp_threshold 10 -K 20 -o ./
Load results and filter out repeated samples from the same patient.
trans_probs <- fread("./data/consensus/transcluster_probabilities.csv", data.table = FALSE) %>%
as_tibble()
# remove repeated samples
trans_probs <- trans_probs %>% filter(!sampleA %in% repeated_samples) %>% filter(!sampleB %in%
repeated_samples)
Plot with possible mixed infections included.
temp <- shearwater_complex_calls %>% filter(!sampleID %in% repeated_samples) %>%
filter(r1_vaf < 0.99 & r2_vaf < 0.99)
high_prev <- names(table(paste(temp$pos, temp$mut)))[table(paste(temp$pos, temp$mut)) >
5]
temp$mut_pos <- paste(temp$pos, temp$mut)
temp$maf <- temp$vaf
temp$maf[temp$vaf > 0.5] <- temp$vaf_ref[temp$vaf > 0.5]
temp <- temp %>% filter(!mut_pos %in% high_prev)
complex_calls <- split(temp, temp$sampleID)
trans_vs_poly <- map_dfr(c(0.01, 0.02, 0.05, 0.1), function(min_maf) {
trans_probs$shared_poly <- map_dbl(1:nrow(trans_probs), function(i) {
if (!trans_probs$sampleA[[i]] %in% names(complex_calls))
return(0)
if (!trans_probs$sampleB[[i]] %in% names(complex_calls))
return(0)
mafA <- complex_calls[[trans_probs$sampleA[[i]]]] %>% filter(maf > min_maf)
mafB <- complex_calls[[trans_probs$sampleB[[i]]]] %>% filter(maf > min_maf)
return(sum(mafA$mut_pos %in% mafB$mut_pos))
})
trans_probs$min_maf <- min_maf
return(trans_probs)
})
trans_vs_poly$`probability direct transmission` <- cut(exp(trans_vs_poly$`0`), breaks = seq(0,
0.4, 0.01))
temp <- trans_vs_poly %>% filter(min_maf == 0.1) %>% filter(exp(`0`) > 0.18) %>%
filter(shared_poly > 0)
length(unique(c(temp$sampleA, temp$sampleB)))
## [1] 6
trans_vs_poly <- trans_vs_poly %>% group_by(min_maf, `probability direct transmission`) %>%
summarise(mean_shared_poly = mean(shared_poly))
colnames(trans_vs_poly) <- c("MAF", "probability direct transmission", "mean shared minor variants")
trans_vs_poly$MAF <- paste("minimum MAF:", trans_vs_poly$MAF)
ggplot(trans_vs_poly, aes(x = `probability direct transmission`, y = `mean shared minor variants`)) +
geom_col() + facet_wrap(~MAF, ncol = 1) + theme_bw(base_size = 16) + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + ylab("mean pairwise shared variants")
ggsave(filename = "./Figures/pairwise_transmission_prob_vs_multi_maf_w_mix.pdf",
width = 9, height = 7)
ggsave(filename = "./Figures/pairwise_transmission_prob_vs_multi_maf_w_mix.png",
width = 9, height = 7)
plot with possible mixtures excluded
trans_probs <- fread("./data/consensus/transcluster_probabilities.csv", data.table = FALSE) %>%
as_tibble()
# remove repeated samples
trans_probs <- trans_probs %>% filter(!sampleA %in% repeated_samples) %>% filter(!sampleB %in%
repeated_samples)
# remove samples that are potentially mixed
mixed_samples <- mixture_tests_mm$sampleA
trans_probs <- trans_probs %>% filter(!sampleA %in% mixed_samples) %>% filter(!sampleB %in%
mixed_samples)
trans_vs_poly <- map_dfr(c(0.01, 0.02, 0.05, 0.1), function(min_maf) {
trans_probs$shared_poly <- map_dbl(1:nrow(trans_probs), function(i) {
if (!trans_probs$sampleA[[i]] %in% names(complex_calls))
return(0)
if (!trans_probs$sampleB[[i]] %in% names(complex_calls))
return(0)
mafA <- complex_calls[[trans_probs$sampleA[[i]]]] %>% filter(maf > min_maf)
mafB <- complex_calls[[trans_probs$sampleB[[i]]]] %>% filter(maf > min_maf)
return(sum(mafA$mut_pos %in% mafB$mut_pos))
})
trans_probs$min_maf <- min_maf
return(trans_probs)
})
trans_vs_poly$`probability direct transmission` <- cut(exp(trans_vs_poly$`0`), breaks = seq(0,
0.4, 0.01))
temp <- trans_vs_poly %>% filter(min_maf == 0.1) %>% filter(exp(`0`) > 0.18) %>%
filter(shared_poly > 0)
length(unique(c(temp$sampleA, temp$sampleB)))
## [1] 6
trans_vs_poly <- trans_vs_poly %>% group_by(min_maf, `probability direct transmission`) %>%
summarise(mean_shared_poly = mean(shared_poly))
colnames(trans_vs_poly) <- c("MAF", "probability direct transmission", "mean shared minor variants")
trans_vs_poly$MAF <- paste("minimum MAF:", trans_vs_poly$MAF)
ggplot(trans_vs_poly, aes(x = `probability direct transmission`, y = `mean shared minor variants`)) +
# geom_point(size=5) +
geom_col() + facet_wrap(~MAF, ncol = 1) + theme_bw(base_size = 16) + theme(axis.text.x = element_text(angle = 45,
hjust = 1)) + ylab("mean pairwise shared variants")
ggsave(filename = "./Figures/pairwise_transmission_prob_vs_multi_maf.pdf", width = 9,
height = 7)
ggsave(filename = "./Figures/pairwise_transmission_prob_vs_multi_maf.png", width = 9,
height = 7)
To investigate the phylogentic signal present in the patterns of intra host varianst across the consensus phylogeny we generated binary presence/absence vectors to indicate which of the replicate pairs a minority variant occurred in. These were then used with the maximum likelihood phylogeny previously described to infer both the delta and D-statistic which indicates the concordance between the minority variant and the consensus phylogeny. The R package caper v0.2 was used to infer the D-statistic (Orme et al. 2013; Fritz and Purvis 2010). The delta statistic was calculated using the R code provided with the publication (Borges et al. 2019).
source("./delta_statistic.R")
samples_considered <- unique(run_consensus_df$sample)[!unique(run_consensus_df$sample) %in%
mixture_tests_mm$sampleA]
samples_considered <- c(samples_considered, "MN908947.3")
ml_tree <- read.tree("./data/consensus/elan.20200529.consensus.filt.mask.tree")
ml_tree <- drop.tip(ml_tree, ml_tree$tip.label[!ml_tree$tip.label %in% samples_considered])
ml_tree <- root(ml_tree, outgroup = "MN908947.3", resolve.root = TRUE)
ml_tree <- multi2di(ml_tree)
ml_tree$edge.length <- pmax(ml_tree$edge.length, 1e-10)
plan(multiprocess)
tb <- table(run_consensus_df$POS[run_consensus_df$sample %in% samples_considered])
delta_results <- furrr::future_map_dfr(names(tb)[tb > 2], ~{
tr <- ml_tree$tip.label %in% run_consensus_df$sample[run_consensus_df$POS ==
.x]
d <- delta(trait = tr, tree = ml_tree, lambda0 = 1, se = 1, sim = 10000, thin = 10,
burn = 5000)
tibble(POS = .x, delta = d)
}, .progress = TRUE)
##
Progress: ──── 100%
Progress: ───── 100%
Progress: ─────── 100%
Progress: ──────── 100%
Progress: ────────── 100%
Progress: ─────────── 100%
Progress: ──────────── 100%
Progress: ────────────── 100%
Progress: ──────────────── 100%
Progress: ──────────────── 100%
Progress: ──────────────── 100%
Progress: ────────────────── 100%
Progress: ──────────────────── 100%
Progress: ────────────────────── 100%
Progress: ────────────────────── 100%
Progress: ──────────────────────── 100%
Progress: ────────────────────────── 100%
Progress: ─────────────────────────── 100%
Progress: ──────────────────────────── 100%
Progress: ───────────────────────────── 100%
Progress: ───────────────────────────── 100%
Progress: ─────────────────────────────── 100%
Progress: ───────────────────────────────── 100%
Progress: ────────────────────────────────── 100%
Progress: ──────────────────────────────────── 100%
Progress: ───────────────────────────────────── 100%
Progress: ────────────────────────────────────── 100%
Progress: ─────────────────────────────────────── 100%
Progress: ───────────────────────────────────────── 100%
Progress: ────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
Progress: ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── 100%
delta_results$log1_delta <- log(delta_results$delta + 1)
delta_results <- delta_results %>% arrange(delta)
pos_counts <- run_consensus_df %>% group_by(POS) %>% summarise(count = n())
delta_results$count <- pos_counts$count[match(delta_results$POS, pos_counts$POS)]
ggplot(delta_results, aes(x = delta)) + geom_histogram() + theme_bw(base_size = 14)
ggsave(file = "./Figures/delta_histogram.png", width = 9, height = 7)
ggsave(file = "./Figures/delta_histogram.pdf", width = 9, height = 7)
write.csv(delta_results, file = "./Processed_data/delta_results.csv", quote = FALSE,
row.names = FALSE)
bindata <- map_dfc(names(tb)[tb > 2], ~{
ml_tree$tip.label %in% run_consensus_df$sample[run_consensus_df$POS == .x]
})
colnames(bindata) <- names(tb)[tb > 2]
bindata <- bindata * 1
bindata <- bindata %>% add_column(sample = ml_tree$tip.label, .before = 1)
bindata <- caper::comparative.data(ml_tree, bindata, sample)
saveRDS(bindata, "./Processed_data/bindata.RDS")
bindata <- readRDS("./Processed_data/bindata.RDS")
plan(multiprocess)
phylo.d <- function(data, phy, names.col, binvar, permut = 1000, rnd.bias = NULL) {
if (!missing(data)) {
if (!inherits(data, "comparative.data")) {
if (missing(names.col))
stop("names column is missing")
names.col <- deparse(substitute(names.col))
data <- caicStyleArgs(data = data, phy = phy, names.col = names.col)
}
}
# binvar <- deparse(substitute(binvar))
bininds <- match(binvar, names(data$data))
if (is.na(bininds))
(stop("'", binvar, "' is not a variable in data."))
ds <- data$data[, bininds]
if (any(is.na(ds)))
stop("'", binvar, "' contains missing values.")
if (is.character(ds))
ds <- as.factor(ds)
if (length(unique(ds)) > 2)
stop("'", binvar, "' contains more than two states.")
if (length(unique(ds)) < 2)
stop("'", binvar, "' only contains a single state.")
propStates <- unclass(table(ds))
propState1 <- propStates[1]/sum(propStates)
names(dimnames(propStates)) <- binvar
if (is.factor(ds))
ds <- as.numeric(ds)
if (!is.numeric(permut))
(stop("'", permut, "' is not numeric."))
if (!is.null(rnd.bias)) {
rnd.bias <- deparse(substitute(rnd.bias))
rnd.ind <- match(rnd.bias, names(data$data))
if (is.na(rnd.ind))
(stop("'", rnd.bias, "' is not a variable in data."))
rnd.bias <- data$data[, rnd.bias]
}
el <- data$phy$edge.length
elTip <- data$phy$edge[, 2] <= length(data$phy$tip.label)
if (any(el[elTip] == 0))
stop("Phylogeny contains pairs of tips on zero branch lengths, cannot currently simulate")
if (any(el[!elTip] == 0))
stop("Phylogeny contains zero length internal branches. Use di2multi.")
ds.ran <- replicate(permut, sample(ds, prob = rnd.bias))
if (is.null(data$vcv)) {
vcv <- VCV.array(data$phy)
} else {
vcv <- data$vcv
}
ds.phy <- rmvnorm(permut, sigma = unclass(vcv))
ds.phy <- as.data.frame(t(ds.phy))
ds.phy.thresh <- apply(ds.phy, 2, quantile, propState1)
ds.phy <- sweep(ds.phy, 2, ds.phy.thresh, "<")
ds.phy <- as.numeric(ds.phy)
dim(ds.phy) <- dim(ds.ran)
ds.ran <- cbind(Obs = ds, ds.ran)
ds.phy <- cbind(Obs = ds, ds.phy)
dimnames(ds.ran) <- dimnames(ds.phy) <- list(data$phy$tip.label, c("Obs", paste("V",
1:permut, sep = "")))
phy <- reorder(data$phy, "pruningwise")
ds.ran.cc <- contrCalc(vals = ds.ran, phy = phy, ref.var = "V1", picMethod = "phylo.d",
crunch.brlen = 0)
ds.phy.cc <- contrCalc(vals = ds.phy, phy = phy, ref.var = "V1", picMethod = "phylo.d",
crunch.brlen = 0)
ransocc <- colSums(ds.ran.cc$contrMat)
physocc <- colSums(ds.phy.cc$contrMat)
if (round(ransocc[1], digits = 6) != round(physocc[1], digits = 6))
stop("Problem with character change calculation in phylo.d")
obssocc <- ransocc[1]
ransocc <- ransocc[-1]
physocc <- physocc[-1]
soccratio <- (obssocc - mean(physocc))/(mean(ransocc) - mean(physocc))
soccpval1 <- sum(ransocc < obssocc)/permut
soccpval0 <- sum(physocc > obssocc)/permut
dvals <- list(DEstimate = soccratio, Pval1 = soccpval1, Pval0 = soccpval0, Parameters = list(Observed = obssocc,
MeanRandom = mean(ransocc), MeanBrownian = mean(physocc)), StatesTable = propStates,
Permutations = list(random = ransocc, brownian = physocc), NodalVals = list(observed = ds.ran.cc$nodVal[,
1, drop = FALSE], random = ds.ran.cc$nodVal[, -1, drop = FALSE], brownian = ds.phy.cc$nodVal[,
-1, drop = FALSE]), binvar = binvar, data = data, nPermut = permut, rnd.bias = rnd.bias)
class(dvals) <- "phylo.d"
return(dvals)
}
Dstat_results <- furrr::future_map_dfr(colnames(bindata$data), ~{
ds <- phylo.d(bindata, ml_tree, binvar = .x)
tibble(pos = .x, D_stat = ds$DEstimate, prob_rand = ds$Pval1, prob_brow = ds$Pval0)
})
write.csv(Dstat_results, "./Processed_data/Dstat_results.csv", row.names = FALSE,
quote = FALSE)
combine into table
Dstat_results <- fread("./Processed_data/Dstat_results.csv", data.table = FALSE) %>%
as_tibble()
concordance_stats <- merge(delta_results, Dstat_results, by.x = "POS", by.y = "pos",
all = TRUE) %>% as_tibble()
concordance_stats$log1_delta <- NULL
colnames(concordance_stats) <- c("position", "delta", "MAF count", "D statistic",
"D probability random", "D probability brownian")
concordance_stats <- concordance_stats[, c("position", "delta", "D statistic", "D probability random",
"D probability brownian", "MAF count")]
write.csv(concordance_stats, file = "Processed_data/concordance_statistics.csv",
quote = FALSE, row.names = FALSE)
knitr::kable(concordance_stats)
| position | delta | D statistic | D probability random | D probability brownian | MAF count |
|---|---|---|---|---|---|
| 10029 | 922.169005 | 0.2889918 | 0.034 | 0.160 | 5 |
| 10076 | 906.579608 | 0.7788818 | 0.435 | 0.019 | 3 |
| 10252 | 34.238295 | 2.0590863 | 0.934 | 0.000 | 4 |
| 10323 | 20.060643 | 0.6586141 | 0.182 | 0.006 | 7 |
| 10369 | 33.919858 | 0.7769929 | 0.333 | 0.005 | 6 |
| 10507 | 895.803689 | 2.0701430 | 0.961 | 0.000 | 4 |
| 106 | 47.056203 | 1.8846633 | 0.908 | 0.002 | 3 |
| 10626 | 31.250973 | 1.3354581 | 0.778 | 0.000 | 5 |
| 10702 | 16.346129 | 0.6890390 | 0.358 | 0.033 | 3 |
| 10755 | 904.974023 | 0.9468429 | 0.544 | 0.009 | 3 |
| 10986 | 41.850203 | 1.4081788 | 0.800 | 0.000 | 4 |
| 11071 | 460.150452 | 0.9670275 | 0.531 | 0.001 | 4 |
| 11074 | 5.806614 | 0.7854052 | 0.131 | 0.000 | 23 |
| 1108 | 916.869713 | -0.9171246 | 0.000 | 0.999 | 5 |
| 11083 | 4.219420 | 0.8523321 | 0.112 | 0.000 | 54 |
| 11152 | 904.226865 | 1.6681246 | 0.845 | 0.001 | 3 |
| 11224 | 39.781636 | 1.1497168 | 0.668 | 0.002 | 4 |
| 11595 | 26.138538 | 1.4058260 | 0.778 | 0.001 | 3 |
| 12164 | 7.236215 | 1.1097504 | 0.653 | 0.000 | 7 |
| 12396 | 460.362242 | 1.5772495 | 0.833 | 0.002 | 3 |
| 12578 | 27.816617 | 1.0514094 | 0.599 | 0.000 | 8 |
| 12789 | 903.274064 | 1.0454830 | 0.606 | 0.004 | 3 |
| 1288 | 24.590686 | 0.5640706 | 0.272 | 0.064 | 3 |
| 13458 | 46.382286 | 1.7416906 | 0.875 | 0.002 | 4 |
| 13571 | 8.558075 | 0.9191850 | 0.415 | 0.000 | 13 |
| 13780 | 14.429531 | 0.6191912 | 0.025 | 0.000 | 18 |
| 14183 | 35.696348 | 0.5053818 | 0.221 | 0.094 | 3 |
| 14277 | 46.913249 | 0.7378212 | 0.394 | 0.029 | 3 |
| 14411 | 23.552494 | 3.3055198 | 0.999 | 0.000 | 4 |
| 14599 | 13.458579 | 1.7430929 | 0.885 | 0.001 | 3 |
| 14790 | 21.720046 | 1.3391999 | 0.806 | 0.001 | 5 |
| 14805 | 31.537316 | 1.3359925 | 0.760 | 0.000 | 15 |
| 14928 | 896.233160 | -0.9962628 | 0.000 | 1.000 | 4 |
| 15004 | 47.252572 | 1.2024500 | 0.696 | 0.004 | 3 |
| 1547 | 7.509224 | 0.8817779 | 0.220 | 0.000 | 35 |
| 15720 | 24.698824 | 0.6660716 | 0.287 | 0.029 | 4 |
| 15738 | 15.502143 | 0.4457783 | 0.196 | 0.101 | 3 |
| 15842 | 897.731312 | 0.4683315 | 0.161 | 0.080 | 4 |
| 16111 | 38.020458 | 0.1378600 | 0.055 | 0.352 | 3 |
| 16332 | 894.854330 | 1.9797325 | 0.919 | 0.000 | 3 |
| 16375 | 17.062077 | 1.2148665 | 0.823 | 0.000 | 13 |
| 16393 | 713.642300 | 0.9375319 | 0.522 | 0.003 | 4 |
| 16438 | 918.727623 | 0.8064141 | 0.375 | 0.003 | 5 |
| 16466 | 17.949630 | 1.2311658 | 0.815 | 0.000 | 14 |
| 16474 | 29.185294 | 1.0100562 | 0.567 | 0.002 | 5 |
| 16887 | 21.766963 | 0.4905783 | 0.008 | 0.012 | 13 |
| 16949 | 31.978245 | 0.8331507 | 0.388 | 0.005 | 6 |
| 17012 | 31.089600 | 0.8861878 | 0.500 | 0.009 | 3 |
| 17013 | 338.063868 | 0.7563616 | 0.330 | 0.010 | 6 |
| 17074 | 18.515522 | 0.9220798 | 0.476 | 0.002 | 5 |
| 17167 | 22.446415 | 0.8193273 | 0.282 | 0.001 | 13 |
| 17252 | 927.354404 | 0.9768491 | 0.555 | 0.008 | 3 |
| 17321 | 910.678323 | 0.9760400 | 0.556 | 0.012 | 3 |
| 17410 | 921.098672 | 0.5210627 | 0.163 | 0.058 | 4 |
| 17440 | 46.965205 | 1.4276433 | 0.778 | 0.003 | 3 |
| 17456 | 23.668454 | 2.0450311 | 0.922 | 0.000 | 4 |
| 17550 | 34.566605 | 0.8324486 | 0.414 | 0.008 | 5 |
| 17676 | 40.726470 | 1.2842821 | 0.749 | 0.000 | 5 |
| 17678 | 908.614841 | 0.2315699 | 0.071 | 0.266 | 3 |
| 17731 | 610.022560 | -0.0562745 | 0.015 | 0.584 | 3 |
| 17795 | 19.524504 | 1.1932256 | 0.686 | 0.001 | 4 |
| 18028 | 913.610410 | 1.0682968 | 0.616 | 0.006 | 3 |
| 18555 | 899.060274 | 1.5076961 | 0.809 | 0.000 | 4 |
| 186 | 37.215302 | 0.4626009 | 0.123 | 0.065 | 5 |
| 18744 | 912.800721 | 0.9844032 | 0.571 | 0.007 | 3 |
| 18877 | 35.577666 | 0.8819918 | 0.473 | 0.007 | 4 |
| 1912 | 904.698722 | 0.9902185 | 0.537 | 0.000 | 6 |
| 19269 | 916.020672 | 1.6826850 | 0.890 | 0.000 | 4 |
| 19679 | 39.523938 | 0.8464676 | 0.473 | 0.022 | 3 |
| 203 | 8.272362 | 1.0469806 | 0.606 | 0.000 | 15 |
| 20578 | 47.086819 | 0.7468074 | 0.391 | 0.030 | 4 |
| 20627 | 28.441100 | 1.1606003 | 0.676 | 0.002 | 4 |
| 20759 | 18.007787 | 1.0343056 | 0.579 | 0.004 | 4 |
| 20762 | 903.303327 | 0.8766545 | 0.487 | 0.014 | 3 |
| 20922 | 16.868402 | 1.0618966 | 0.616 | 0.004 | 4 |
| 20925 | 52.493699 | 1.1190603 | 0.637 | 0.006 | 3 |
| 21137 | 21.098065 | 1.1067612 | 0.691 | 0.000 | 12 |
| 2144 | 55.981400 | 1.5891500 | 0.828 | 0.001 | 3 |
| 21575 | 21.401315 | 1.6091362 | 0.979 | 0.000 | 15 |
| 21622 | 34.546693 | 0.6739179 | 0.355 | 0.047 | 3 |
| 21707 | 912.832652 | -0.1996671 | 0.007 | 0.708 | 3 |
| 21752 | 45.751451 | 0.7586685 | 0.413 | 0.034 | 3 |
| 2189 | 43.294889 | 1.8617404 | 0.904 | 0.000 | 3 |
| 22104 | 39.590112 | 0.8421468 | 0.475 | 0.015 | 3 |
| 222 | 904.011964 | 0.7765236 | 0.441 | 0.024 | 3 |
| 22899 | 23.526642 | 1.0334498 | 0.574 | 0.000 | 9 |
| 23029 | 25.322181 | 1.3845725 | 0.761 | 0.001 | 3 |
| 23161 | 48.367080 | 0.5947332 | 0.289 | 0.043 | 3 |
| 23189 | 42.483799 | 0.6016069 | 0.308 | 0.054 | 3 |
| 23191 | 643.528912 | -0.0226213 | 0.018 | 0.561 | 3 |
| 23202 | 31.727734 | 0.7876710 | 0.390 | 0.015 | 4 |
| 23242 | 47.130087 | 1.6626637 | 0.863 | 0.001 | 3 |
| 23243 | 46.565709 | 0.9617360 | 0.557 | 0.011 | 3 |
| 23248 | 23.210448 | 0.9885911 | 0.550 | 0.004 | 4 |
| 23271 | 46.933662 | 0.8391196 | 0.478 | 0.027 | 3 |
| 23272 | 47.137930 | 0.4058863 | 0.177 | 0.144 | 3 |
| 23277 | 26.954562 | 0.3524829 | 0.096 | 0.122 | 4 |
| 23278 | 10.824628 | 0.8523076 | 0.453 | 0.003 | 4 |
| 23280 | 38.010677 | 0.6663094 | 0.267 | 0.011 | 5 |
| 23282 | 46.276262 | 0.9441398 | 0.524 | 0.011 | 4 |
| 23285 | 46.655767 | 0.9228932 | 0.516 | 0.011 | 3 |
| 23310 | 47.066340 | 0.7207019 | 0.377 | 0.032 | 3 |
| 23311 | 907.275661 | 0.5264970 | 0.197 | 0.061 | 4 |
| 23435 | 18.031532 | 1.0004395 | 0.555 | 0.000 | 6 |
| 23525 | 28.639411 | 0.4442847 | 0.100 | 0.063 | 6 |
| 23868 | 38.216322 | 1.5100397 | 0.807 | 0.001 | 3 |
| 23987 | 15.171188 | 0.2667905 | 0.050 | 0.213 | 4 |
| 241 | 47.202898 | 2.9034475 | 0.981 | 0.000 | 16 |
| 24138 | 611.683294 | 0.6066293 | 0.312 | 0.039 | 3 |
| 24213 | 10.977988 | 1.2727473 | 0.830 | 0.000 | 11 |
| 24381 | 900.006527 | 0.6823688 | 0.348 | 0.032 | 3 |
| 2453 | 903.310090 | -0.9881991 | 0.000 | 1.000 | 4 |
| 24781 | 909.726101 | -0.8832196 | 0.000 | 0.969 | 3 |
| 24899 | 14.111114 | 0.6006265 | 0.287 | 0.068 | 3 |
| 24933 | 908.956592 | 0.8644515 | 0.366 | 0.000 | 9 |
| 25000 | 35.308871 | 1.1283243 | 0.659 | 0.000 | 5 |
| 25036 | 918.931021 | 0.6163239 | 0.316 | 0.052 | 3 |
| 25096 | 904.548738 | 0.2635194 | 0.091 | 0.221 | 3 |
| 25169 | 916.284864 | 2.2396352 | 0.951 | 0.000 | 5 |
| 25521 | 8.893667 | 1.2062823 | 0.830 | 0.000 | 20 |
| 25572 | 34.810932 | 1.3631463 | 0.776 | 0.003 | 3 |
| 25613 | 917.232161 | 1.3739975 | 0.756 | 0.001 | 3 |
| 25728 | 47.665211 | 1.8599714 | 0.893 | 0.000 | 3 |
| 25889 | 19.500558 | 1.3352943 | 0.795 | 0.001 | 4 |
| 25996 | 919.345814 | 1.7056470 | 0.860 | 0.003 | 3 |
| 26137 | 6.748433 | 0.8627953 | 0.113 | 0.000 | 50 |
| 26270 | 22.773422 | 1.8003898 | 0.916 | 0.000 | 4 |
| 26333 | 8.554091 | 0.9343625 | 0.485 | 0.000 | 7 |
| 26408 | 47.217866 | 1.5946631 | 0.843 | 0.000 | 3 |
| 26526 | 903.180683 | 0.2312238 | 0.092 | 0.266 | 3 |
| 26533 | 913.523868 | 1.4092581 | 0.769 | 0.001 | 4 |
| 26681 | 38.378944 | 1.0019230 | 0.541 | 0.000 | 7 |
| 26735 | 904.511829 | 1.7548477 | 0.867 | 0.000 | 3 |
| 26768 | 36.819742 | 2.2141653 | 0.976 | 0.000 | 6 |
| 26801 | 20.945421 | 0.7359306 | 0.309 | 0.006 | 5 |
| 26858 | 46.569481 | 0.4650056 | 0.198 | 0.110 | 4 |
| 26895 | 536.050587 | 0.6203520 | 0.266 | 0.033 | 5 |
| 26927 | 21.217970 | 0.7373354 | 0.397 | 0.030 | 3 |
| 27213 | 42.044558 | 0.7905364 | 0.383 | 0.012 | 4 |
| 27297 | 46.620802 | 0.8961628 | 0.497 | 0.005 | 4 |
| 27384 | 20.143295 | 1.3463073 | 0.810 | 0.000 | 7 |
| 27390 | 52.982767 | 0.2729931 | 0.107 | 0.236 | 3 |
| 27434 | 12.462871 | 0.7759181 | 0.434 | 0.025 | 3 |
| 27476 | 16.767318 | 0.7926654 | 0.436 | 0.025 | 3 |
| 27493 | 890.270021 | 1.2159817 | 0.706 | 0.003 | 3 |
| 27876 | 917.066349 | -0.2164479 | 0.008 | 0.739 | 3 |
| 27877 | 916.835971 | -0.2378501 | 0.009 | 0.760 | 3 |
| 27920 | 40.861877 | 0.6959937 | 0.319 | 0.024 | 5 |
| 27925 | 20.992935 | 1.0727282 | 0.620 | 0.000 | 6 |
| 28001 | 32.363236 | 0.2000732 | 0.084 | 0.322 | 3 |
| 28077 | 913.168074 | 2.1438350 | 0.937 | 0.000 | 3 |
| 28079 | 16.366369 | 0.6299436 | 0.150 | 0.010 | 7 |
| 28086 | 46.759769 | 1.2421113 | 0.719 | 0.000 | 3 |
| 28087 | 24.592351 | 0.8613318 | 0.443 | 0.006 | 4 |
| 28093 | 901.509063 | 2.3999901 | 0.960 | 0.000 | 3 |
| 28253 | 7.986339 | 1.1458794 | 0.885 | 0.000 | 52 |
| 28310 | 915.710318 | 2.1325025 | 0.944 | 0.000 | 3 |
| 28377 | 906.871995 | 1.0835206 | 0.619 | 0.006 | 4 |
| 28603 | 26.379759 | 1.0751980 | 0.632 | 0.000 | 10 |
| 28744 | 589.794269 | 0.0391932 | 0.011 | 0.482 | 4 |
| 28770 | 59.025294 | 0.0593732 | 0.034 | 0.450 | 3 |
| 28826 | 10.871870 | 0.1225743 | 0.000 | 0.362 | 8 |
| 28854 | 33.657232 | 1.1900244 | 0.693 | 0.001 | 5 |
| 28881 | 17.365026 | 1.2870147 | 0.775 | 0.000 | 16 |
| 28882 | 17.087211 | 1.3175207 | 0.771 | 0.000 | 15 |
| 28883 | 17.080275 | 1.3356146 | 0.785 | 0.000 | 17 |
| 28887 | 9.251802 | 0.9709223 | 0.517 | 0.000 | 8 |
| 29041 | 908.196895 | 0.8351584 | 0.450 | 0.021 | 3 |
| 29095 | 895.684147 | 1.2506875 | 0.693 | 0.005 | 3 |
| 29160 | 908.532255 | 0.6119807 | 0.315 | 0.047 | 3 |
| 29200 | 46.974102 | 2.1946723 | 0.934 | 0.000 | 3 |
| 29242 | 911.164435 | 0.9815830 | 0.496 | 0.000 | 9 |
| 29247 | 17.546656 | 2.6387378 | 0.969 | 0.000 | 3 |
| 29253 | 9.160738 | 0.9487779 | 0.511 | 0.006 | 4 |
| 29272 | 511.342743 | 0.3832750 | 0.168 | 0.152 | 3 |
| 29274 | 46.937769 | 0.9036224 | 0.506 | 0.007 | 3 |
| 29311 | 41.476842 | 0.4627240 | 0.134 | 0.065 | 5 |
| 29421 | 521.195248 | 0.3205215 | 0.125 | 0.186 | 3 |
| 29543 | 47.122681 | 0.7796417 | 0.422 | 0.034 | 3 |
| 29555 | 24.481053 | 1.4674829 | 0.799 | 0.002 | 3 |
| 29635 | 907.365292 | 1.4327927 | 0.818 | 0.000 | 4 |
| 29640 | 915.174925 | 0.3231262 | 0.124 | 0.190 | 3 |
| 29642 | 905.396492 | 2.2900685 | 0.942 | 0.000 | 3 |
| 29686 | 909.991881 | 2.0812054 | 0.930 | 0.000 | 3 |
| 29700 | 46.544495 | 0.8482502 | 0.468 | 0.017 | 3 |
| 29730 | 28.443528 | 0.2773870 | 0.045 | 0.181 | 4 |
| 29742 | 914.323475 | 1.0001127 | 0.585 | 0.010 | 3 |
| 29743 | 440.737131 | 0.5434769 | 0.209 | 0.050 | 5 |
| 29750 | 897.004770 | 1.6027514 | 0.851 | 0.000 | 4 |
| 29754 | 10.051708 | 0.9748936 | 0.534 | 0.002 | 5 |
| 29774 | 31.463890 | 0.7765144 | 0.379 | 0.015 | 4 |
| 29779 | 13.724440 | 1.0916799 | 0.642 | 0.002 | 3 |
| 29781 | 904.168434 | 1.0112811 | 0.585 | 0.006 | 4 |
| 3096 | 7.861504 | 1.1572549 | 0.766 | 0.000 | 15 |
| 313 | 904.352748 | 0.2976671 | 0.125 | 0.210 | 3 |
| 3176 | 914.390074 | 1.7797672 | 0.881 | 0.000 | 3 |
| 3177 | 378.277723 | 0.5786564 | 0.241 | 0.041 | 4 |
| 3259 | 924.715684 | 0.8467935 | 0.443 | 0.010 | 5 |
| 337 | 18.734333 | 0.4466009 | 0.199 | 0.118 | 3 |
| 3425 | 576.759090 | -0.2167123 | 0.000 | 0.761 | 3 |
| 346 | 33.331726 | 1.8512194 | 0.922 | 0.000 | 5 |
| 3564 | 45.186188 | 0.9911267 | 0.559 | 0.015 | 4 |
| 3877 | 14.804358 | 0.4382944 | 0.093 | 0.059 | 5 |
| 4399 | 899.162216 | 0.7977106 | 0.446 | 0.021 | 3 |
| 4505 | 46.974473 | 0.7738092 | 0.429 | 0.026 | 3 |
| 462 | 907.884544 | 0.2866699 | 0.051 | 0.196 | 4 |
| 4815 | 47.145122 | 0.5622523 | 0.271 | 0.060 | 3 |
| 558 | 25.641086 | 0.8661731 | 0.366 | 0.000 | 9 |
| 5622 | 919.703200 | -0.1710916 | 0.011 | 0.692 | 3 |
| 5812 | 26.006750 | 0.6199765 | 0.318 | 0.055 | 3 |
| 5826 | 917.320630 | 1.2414954 | 0.712 | 0.003 | 3 |
| 5869 | 903.658552 | 1.0885543 | 0.648 | 0.004 | 3 |
| 6121 | 908.128294 | 2.0983190 | 0.935 | 0.000 | 3 |
| 6286 | 47.381993 | 0.5391345 | 0.253 | 0.063 | 3 |
| 6317 | 911.779172 | 0.6748436 | 0.315 | 0.015 | 4 |
| 635 | 4.442886 | 0.9107739 | 0.259 | 0.000 | 42 |
| 643 | 20.875013 | 1.8036783 | 0.887 | 0.001 | 3 |
| 6513 | 46.928088 | 0.7790056 | 0.439 | 0.023 | 3 |
| 6539 | 42.690776 | 1.8057689 | 0.911 | 0.000 | 4 |
| 683 | 8.060202 | 1.0364621 | 0.610 | 0.000 | 28 |
| 7420 | 896.710792 | 1.5965251 | 0.829 | 0.001 | 3 |
| 745 | 902.287980 | 0.5546772 | 0.271 | 0.082 | 3 |
| 7504 | 890.069769 | -0.0471217 | 0.015 | 0.577 | 3 |
| 7528 | 535.641122 | 0.1986550 | 0.031 | 0.304 | 5 |
| 7735 | 11.247375 | 1.3358091 | 0.794 | 0.001 | 6 |
| 7765 | 16.723721 | 0.8295135 | 0.479 | 0.014 | 3 |
| 7768 | 928.637960 | 0.7158644 | 0.378 | 0.036 | 3 |
| 7851 | 905.646368 | 0.5982510 | 0.305 | 0.054 | 3 |
| 8318 | 42.816698 | 0.6469713 | 0.282 | 0.028 | 4 |
| 835 | 902.733849 | 0.7618534 | 0.407 | 0.026 | 3 |
| 851 | 900.828356 | 1.0909988 | 0.618 | 0.002 | 3 |
| 9052 | 53.114848 | 0.3836342 | 0.158 | 0.159 | 3 |
| 9165 | 30.831698 | 1.1215850 | 0.659 | 0.000 | 7 |
| 9180 | 37.422018 | 0.7114883 | 0.293 | 0.008 | 5 |
| 9286 | 46.888588 | 0.5611465 | 0.261 | 0.057 | 3 |
| 929 | 911.194296 | 0.8256940 | 0.461 | 0.021 | 3 |
| 9430 | 24.532582 | 0.9120408 | 0.395 | 0.000 | 13 |
| 9438 | 16.785460 | 1.4306946 | 0.949 | 0.000 | 15 |
| 9474 | 14.120403 | 0.7631943 | 0.257 | 0.001 | 8 |
| 9479 | 16.331890 | 0.5122737 | 0.077 | 0.018 | 7 |
| 9491 | 898.542753 | 0.8109986 | 0.198 | 0.000 | 18 |
| 9519 | 924.559804 | 1.6478497 | 0.847 | 0.000 | 3 |
| 9561 | 44.939676 | 2.1842099 | 0.953 | 0.000 | 3 |
| 9603 | 918.395384 | 1.1234314 | 0.634 | 0.006 | 3 |
| 9629 | 47.469329 | 1.9568510 | 0.906 | 0.000 | 3 |
| 9634 | 912.989998 | 1.1172785 | 0.640 | 0.005 | 3 |
| 9737 | 912.922522 | 0.7948590 | 0.358 | 0.006 | 5 |
| 9741 | 905.073732 | 1.5025152 | 0.806 | 0.002 | 3 |
| 9848 | 25.774409 | 0.0509467 | 0.001 | 0.471 | 5 |
| 9866 | 50.138510 | 0.3719184 | 0.099 | 0.130 | 4 |
| 9924 | 919.539635 | 0.9524293 | 0.543 | 0.009 | 3 |
To investigate the possible accumulation of de novo mutations during the course of an infection, we studied 43 individuals for whom we had multiple samples collected at different timepoints. Initially, we collect the VAFs inferred using the shearwater algorithm. Unlike in much of the analysis above, we allow for more complex variants rather than just single nucleotude polymorphisms.
MINMAF <- 0.01
MINDEPTH <- 1000
load("./data/allele_counts_allsites_allsamples.rda")
rownames(countsall) <- fread("./data/sample_list_1181.tsv", header = FALSE, data.table = FALSE)$V1
colnames(countsall) <- 1:ncol(countsall)
dimnames(countsall)[[3]] <- c("A", "T", "C", "G", "-", "INS")
multi_run_calls <- shearwater_complex_calls %>% filter(r1_vaf < 1 & r2_vaf < 1) %>%
filter(sampleID %in% unlist(meta_multi$samples))
multi_run_calls$host <- replicate_meta$biosample_source_id[match(multi_run_calls$sampleID,
replicate_meta$central_sample_id)]
multi_run_calls$date <- replicate_meta$collection_date[match(multi_run_calls$sampleID,
replicate_meta$central_sample_id)]
multi_run_calls <- multi_run_calls %>% filter(date != "None")
temp <- table(multi_run_calls$host[!duplicated(multi_run_calls$sampleID)])
multi_run_calls <- multi_run_calls %>% filter(!multi_run_calls$host %in% names(temp)[temp <
2])
multi_run_calls$date <- as_date(multi_run_calls$date)
multi_run_calls$mut_simple <- multi_run_calls$mut
multi_run_calls$mut_simple[!multi_run_calls$mut_simple %in% c("-", "T", "G", "C",
"A", "INS")] <- map_chr(multi_run_calls$mut_simple[!multi_run_calls$mut_simple %in%
c("-", "T", "G", "C", "A", "INS")], ~{
substr(.x, 1, 1)
})
multi_run_calls_recovered <- multi_run_calls %>% group_by(host) %>% group_map(~{
samples <- unique(.x$sampleID)
nsamples <- length(samples)
mis <- table(paste(.x$mut_simple, .x$pos))
mis <- .x[paste(.x$mut_simple, .x$pos) %in% names(mis)[mis != nsamples], ]
missing <- map_dfr(1:nrow(mis), function(i) {
missing_samples <- unique(samples[!samples %in% mis$sampleID[(mis$pos ==
mis$pos[[i]]) & (mis$mut_simple[[i]] == mis$mut_simple)]])
vafs <- map_dfr(missing_samples, function(s) {
vaf <- countsall[s, mis$pos[[i]], mis$mut_simple[[i]]]/sum(countsall[s,
mis$pos[[i]], ])
countsall["CAMB-76937", 6855, "T"]
sum(countsall["CAMB-76937", 6855, ])
if (vaf > 0.001) {
temp <- mis[i, ]
temp$sampleID <- s
temp$vaf <- vaf
temp$date <- unique(replicate_meta$collection_date[replicate_meta$central_sample_id ==
s])
return(temp)
} else {
return(tibble())
}
})
return(vafs)
})
return(missing)
}, keep = TRUE)
multi_run_calls_recovered <- do.call(rbind, multi_run_calls_recovered)
multi_run_calls_recovered <- multi_run_calls_recovered[!duplicated(multi_run_calls_recovered[,
c("sampleID", "pos", "mut")]), ]
multi_run_calls_recovered <- rbind(multi_run_calls, multi_run_calls_recovered)
all_consensus <- multi_run_calls_recovered %>% group_by(pos, mut, host) %>% summarise(allcons = all(vaf >
0.75)) %>% filter(allcons)
all_consensus$combined <- sprintf("%s %s %s", all_consensus$pos, all_consensus$mut,
all_consensus$host)
multi_run_calls_recovered <- multi_run_calls_recovered %>% filter(!paste(pos, mut,
host) %in% all_consensus$combined)
Generate time series plots of the inferred VAFs within each patient. Multiple samples on the same day are ‘jittered’ to enable the variation within a day to be observed.
all_meta <- fread("./data/majora.20200501.metadata.tsv") %>% as_tibble() %>% filter(central_sample_id %in%
multi_run_calls_recovered$sampleID)
multi_run_calls_recovered$swab_site <- all_meta$swab_site[match(multi_run_calls_recovered$sampleID,
all_meta$central_sample_id)]
multi_run_calls_recovered$sample_type_collected <- all_meta$sample_type_collected[match(multi_run_calls_recovered$sampleID,
all_meta$central_sample_id)]
plotdf <- multi_run_calls_recovered
plotdf$mut_pos <- paste(plotdf$mut_simple, plotdf$pos)
hostdaycount <- table(paste(plotdf$host, plotdf$date)[!duplicated(plotdf$sampleID)])
samedaycount <- unlist(map(paste(plotdf$host, plotdf$date), ~{
ids <- unique(plotdf$sampleID[paste(plotdf$host, plotdf$date) == .x])
ids <- setNames(1:length(ids), ids)
}))
min_date <- plotdf %>% group_by(host) %>% summarise(min_dat = min(date))
min_date <- setNames(min_date$min_dat, min_date$host)
plotdf$date_adj <- plotdf$date - min_date[plotdf$host]
plotdf$date_adj <- plotdf$date_adj + (samedaycount[plotdf$sampleID] - 1) * 0.1 -
(hostdaycount[paste(plotdf$host, plotdf$date)] - 1) * 0.05
ggplot(plotdf, aes(x = date_adj, y = vaf, col = mut_pos)) + geom_line(alpha = 0.5) +
geom_point(alpha = 0.5) + facet_wrap(~host, scales = "free_x") + scale_color_discrete(guide = FALSE) +
scale_x_continuous(breaks = 0:9) + scale_y_sqrt() + theme_bw(base_size = 14) +
xlab("days from first sample") + ylab("VAF")
ggsave(filename = "./Figures/vaf_by_repeated_sample.pdf", width = 15, height = 10)
ggsave(filename = "./Figures/vaf_by_repeated_sample.png", width = 15, height = 10)
To focus on the variance within samples we plot the number of shared variants versus the total number of minority variants for each pairwise combination of samples taken from the same patient on the same day. We also plot the proportion of shared variants between each pairwise combination, split by the different sampling techniques used.
same_days <- table(paste(multi_run_calls_recovered$host, multi_run_calls_recovered$date)[!duplicated(multi_run_calls_recovered$sampleID)])
same_days <- same_days[same_days > 1]
plotdf <- multi_run_calls_recovered[paste(multi_run_calls_recovered$host, multi_run_calls_recovered$date) %in%
names(same_days), ]
plotdf$mut_pos <- paste(plotdf$mut, plotdf$pos)
plotdf2 <- plotdf %>% group_by(host, date) %>% summarise(sample_type = paste(sort(unique(sample_type_collected)),
collapse = "_"), n_samples = length(unique(sampleID)), n_shared = sum(table(mut_pos) ==
length(unique(sampleID))), mean_shared = mean(table(mut_pos)/length(unique(sampleID))),
ids = list(unique(sampleID)), tm = list(table(mut_pos)), n_muts = length(unique(mut_pos)),
prop_shared = sum(table(mut_pos) == length(unique(sampleID)))/length(unique(mut_pos)))
gg1 <- ggplot(plotdf2, aes(x = sample_type, y = prop_shared)) + geom_boxplot(outlier.colour = NA) +
geom_jitter(height = 0, width = 0.1, size = 2) + theme_bw(base_size = 14) + xlab("sample types") +
ylab("proportion of shared minority variants")
gg1
ggsave(filename = "./Figures/swab_type_vs_prop_shared.pdf", width = 9, height = 7)
ggsave(filename = "./Figures/swab_type_vs_prop_shared.png", width = 9, height = 7)
gg2 <- ggplot(plotdf2, aes(x = n_shared, y = n_muts)) + geom_jitter(height = 0.1,
width = 0.1, size = 2) + theme_bw(base_size = 14) + xlab("number of shared minority variant") +
ylab("total number of minority variants")
gg2
ggsave(filename = "./Figures/maf_total_vs_shared.pdf", width = 9, height = 7)
ggsave(filename = "./Figures/maf_total_vs_shared.png", width = 9, height = 7)
gg1 + gg2 + patchwork::plot_layout(nrow = 1)
# test excluding samples with more than two to avoid bias
t.test(plotdf2$prop_shared[plotdf2$n_samples == 2 & plotdf2$sample_type == "swab"],
plotdf2$prop_shared[plotdf2$n_samples == 2 & plotdf2$sample_type == "sputum_swab"],
alternative = "greater")
##
## Welch Two Sample t-test
##
## data: plotdf2$prop_shared[plotdf2$n_samples == 2 & plotdf2$sample_type == and plotdf2$prop_shared[plotdf2$n_samples == 2 & plotdf2$sample_type == "swab"] and "sputum_swab"]
## t = 0.8484, df = 9.6287, p-value = 0.2084
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## -0.1416669 Inf
## sample estimates:
## mean of x mean of y
## 0.4226266 0.2988684
# alternative strategy of including the number of samples in the model leads to a
# similar result.
plotdf2$isswab <- plotdf2$sample_type == "swab"
summary(glm(n_shared ~ isswab + n_samples + n_muts, data = plotdf2, family = "poisson"))
##
## Call:
## glm(formula = n_shared ~ isswab + n_samples + n_muts, family = "poisson",
## data = plotdf2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9422 -0.9009 -0.4691 0.5778 2.1638
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.312078 0.556106 4.158 3.22e-05 ***
## isswabTRUE 0.213813 0.207103 1.032 0.302
## n_samples -0.286469 0.260141 -1.101 0.271
## n_muts 0.003482 0.011060 0.315 0.753
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 30.292 on 16 degrees of freedom
## Residual deviance: 26.173 on 13 degrees of freedom
## AIC: 95.326
##
## Number of Fisher Scoring iterations: 5
Finally, we plot the difference in the number of withinhost minority variants between each pairwise combination of samples where the x-axis indicates the number of days seperating those samples.
pairwise_within_host_maf_diff <- map_dfr(unique(multi_run_calls_recovered$host),
~{
print(.x)
temp <- multi_run_calls %>% filter(host == .x) %>% group_by(sampleID, date) %>%
summarise(maf_count = n()) %>% arrange(date)
if (nrow(temp) < 2)
return(tibble())
pw <- combinat::combn(nrow(temp), 2, simplify = FALSE)
maf_diff <- map_dfr(pw, function(p) {
tibble(date_diff = temp$date[[p[[2]]]] - temp$date[[p[[1]]]], maf_diff = temp$maf_count[[p[[2]]]] -
temp$maf_count[[p[[1]]]])
}) %>% group_by(date_diff) %>% summarise(mean_maf_diff = mean(maf_diff))
maf_diff$host <- .x
return(maf_diff)
})
## [1] "CAMS001914"
## [1] "CAMS001867"
## [1] "CAMS001503"
## [1] "CAMS001487"
## [1] "CAMS001467"
## [1] "CAMS001424"
## [1] "CAMS000787"
## [1] "CAMS000737"
## [1] "CAMS000672"
## [1] "CAMS000668"
## [1] "CAMS001910"
## [1] "CAMS002063"
## [1] "CAMS001538"
## [1] "CAMS001604"
## [1] "CAMS001586"
## [1] "CAMS001653"
## [1] "CAMS001666"
## [1] "CAMS001664"
## [1] "CAMS001670"
## [1] "CAMS001662"
## [1] "CAMS001741"
## [1] "CAMS001715"
## [1] "CAMS001803"
## [1] "CAMS001997"
## [1] "CAMS002135"
## [1] "CAMS002279"
## [1] "CAMS002128"
## [1] "CAMS002472"
## [1] "CAMS002162"
## [1] "CAMS002291"
## [1] "CAMS002186"
## [1] "CAMS002480"
## [1] "CAMS002667"
## [1] "CAMS002621"
## [1] "CAMS002328"
## [1] "CAMS002327"
## [1] "CAMS002325"
## [1] "CAMS002414"
## [1] "CAMS002412"
## [1] "CAMS002435"
## [1] "CAMS002532"
gg3 <- ggplot(pairwise_within_host_maf_diff, aes(x = date_diff, y = mean_maf_diff,
group = date_diff)) + geom_boxplot(outlier.colour = NA) + geom_jitter(height = 0,
width = 0.2) + theme_bw(base_size = 14) + scale_x_continuous(breaks = 0:9) +
xlab("time difference between samples (days)") + ylab("difference in number of minor alleles")
gg3
model_data <- multi_run_calls_recovered %>% group_by(host, date, sampleID, sample_type_collected) %>%
summarise(maf_count = n())
model_data$adj_date <- as.numeric(model_data$date - min_date[model_data$host])
summary(glmer(maf_count ~ adj_date + sample_type_collected + (1 | host), family = "poisson",
data = model_data))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: maf_count ~ adj_date + sample_type_collected + (1 | host)
## Data: model_data
##
## AIC BIC logLik deviance df.resid
## 653.5 666.0 -321.7 643.5 85
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.4443 -0.7832 -0.1684 0.7942 3.0867
##
## Random effects:
## Groups Name Variance Std.Dev.
## host (Intercept) 0.3254 0.5704
## Number of obs: 90, groups: host, 41
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.71420 0.18374 14.772 < 2e-16 ***
## adj_date 0.07559 0.02784 2.715 0.00663 **
## sample_type_collectedsputum 0.07890 0.17547 0.450 0.65296
## sample_type_collectedswab -0.26338 0.16662 -1.581 0.11394
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) adj_dt smpl_typ_cllctdsp
## adj_date -0.091
## smpl_typ_cllctdsp -0.717 -0.064
## smpl_typ_cllctdsw -0.845 -0.037 0.805
ggsave(filename = "./Figures/repeated_samples_maf_count_vs_date.pdf", width = 9,
height = 7)
ggsave(filename = "./Figures/repeated_samples_maf_count_vs_date.png", width = 9,
height = 7)